import re
import time
from copy import deepcopy

from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# Module-level Chrome WebDriver shared by all TrainSpider methods; the
# browser window stays open so the user can complete the login manually.
driver = webdriver.Chrome()


class TrainSpider(object):
    """Scrape Taobao search results for a user-supplied keyword.

    Workflow: the user logs in manually in the Selenium-driven browser,
    then the spider submits the keyword on the home page and walks every
    result page, printing one record (name, price, sales, free-shipping
    flag) per listing.
    """

    login_url = "https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Fi.taobao.com%2Fmy_taobao.htm%3Fspm%3Da21bo.2017.1997525045.1.5af911d96Fx0mM"
    personal_url = "https://i.taobao.com/my_taobao.htm"
    search_start_url = "https://www.taobao.com/"
    search_url = "https://s.taobao.com/search"
    page = 1  # total number of result pages; filled in by search_start()
    key_word = ''  # search keyword; filled in by login()

    # Template record for one listing; deep-copied before each scrape pass.
    sql_data = dict(
        goods_name='',  # product name
        price='',  # product price
        sales='',  # sales count (raw text such as "1234" or "1.2万+")
        icon_service_free=''  # free shipping: '是' (yes) / '否' (no)
    )

    def login(self):
        """Prompt for the keyword, then block until manual login completes."""
        self.key_word = input("请输入你要爬取信息的商品关键字：")
        driver.get(self.login_url)
        # Wait until the URL becomes the personal-center URL, which signals
        # that the (manual) login succeeded.  The very long timeout leaves
        # plenty of time for a human to type credentials / scan a QR code.
        WebDriverWait(driver, 1000).until(
            EC.url_contains(self.personal_url)
        )
        print("登录成功！")

    def search_start(self):
        """Submit the keyword from the home page and read the page count."""
        driver.get(self.search_start_url)
        print('成功登陆首页')
        # find_element_by_id/_by_xpath were removed in Selenium 4; the
        # By-based API below works on both Selenium 3 and 4.
        from_station_input = driver.find_element(By.ID, "q")
        from_station_input.send_keys(self.key_word)

        search_btn = driver.find_element(By.XPATH, "//button[@class = 'btn-search tb-bg']")
        search_btn.click()

        WebDriverWait(driver, 1000).until(
            EC.url_contains(self.search_url)
        )
        time.sleep(4)  # let the JS-rendered result list finish loading

        html = driver.page_source
        html_l = etree.HTML(html)
        # The "total" div reads like "共 100 页，"; the first run of digits
        # is the number of result pages.
        pages = html_l.xpath('//div[@class ="total"]/text()')
        page = re.findall(r'\d+', pages[0])
        self.page = int(page[0])

    def search_working(self):
        """Scrape every listing on the current page, then page forward.

        On the last result page the "next" link is absent; that case is
        treated as "done" instead of raising NoSuchElementException.
        """
        html = driver.page_source
        sql_data = deepcopy(self.sql_data)
        soup = BeautifulSoup(html, 'lxml')
        soup_divs = soup.find_all('div', attrs={'data-category': "auctions"})
        for soup_div in soup_divs:
            soup_names = soup_div.find('div', attrs={'class': 'row row-2 title'})
            soup_elements = soup_div.find('div', attrs={'class': "row row-1 g-clearfix"})
            a_s = soup_names.a
            strong_s = soup_elements.strong
            sales = soup_elements.find('div', attrs={'class': "deal-cnt"})
            # "1234人付款" -> "1234"; "1.2万+人付款" -> "1.2万+"
            sale = re.findall(r'(\d+\.*\d*万*\+*)人', sales.string)
            if soup_elements.find('div', attrs={'class': "ship icon-service-free"}):
                sql_data['icon_service_free'] = '是'
            else:
                sql_data['icon_service_free'] = '否'
            # Some listings carry no parsable sales text; fall back to ''.
            sql_data['sales'] = sale[0] if sale else ''
            sql_data['price'] = strong_s.string
            # The title <a> is split across several text nodes (highlighted
            # keyword fragments); strip all whitespace and join them into
            # one product name.
            parts = []
            for string in a_s.strings:
                parts.append(''.join(re.findall(r'\S', string)).strip())
            sql_data['goods_name'] = ''.join(parts).strip()
            print(sql_data)

        # Advance to the next result page; absent button == last page.
        try:
            search_btn = driver.find_element(By.XPATH, '//a[@trace="srp_bottom_pagedown"]')
        except NoSuchElementException:
            return
        search_btn.click()
        WebDriverWait(driver, 1000).until(
            EC.url_contains(self.search_url)
        )
        time.sleep(4)  # let the next page finish rendering

    def run(self):
        """Full workflow: log in, start the search, scrape every page."""
        # 1. Log in (manually, in the opened browser window).
        self.login()
        # 2. Open the home page and submit the search.
        self.search_start()
        # 3. Scrape each result page in turn.
        for _ in range(self.page):
            self.search_working()
        # quit() (not close()) ends the whole WebDriver session, not just
        # the current window.
        driver.quit()


def main():
    """Entry point: build a TrainSpider and execute its full workflow."""
    TrainSpider().run()


if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    main()